import pandas as pd
import os
from directory_tree import display_tree
from sklearn.metrics import roc_auc_score
from pathlib import Path
import duckdb
import ibis
from ibis import _
from ibis import selectors as s
import re
from importlib import reload
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
import altair as alt
from functools import reduceDATA_DIR = 'home-credit-credit-risk-model-stability/parquet_files/train'
p = Path(DATA_DIR)
ibis.options.interactive=True
# load data into directory
data_files=list(p.glob('*.parquet'))#load_all data
ds = {re.sub('.parquet','',f.name):ibis.read_parquet(f) for f in data_files}Why train_static? Please this dataset don’t seems to have cardinality problems.
from importlib import reload
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
# lookup column definition here if you don't know what
intersting_cols = [
'bankacctype_710L',
'credtype_322L','disbursementtype_67L',
'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
'twobodfilling_608L','typesuite_864L'
]
pretty_display(
FeatureDefinition().lookup_col(intersting_cols)
)Variable.str.contains("bankacctype_710L") or Variable.str.contains("credtype_322L") or Variable.str.contains("disbursementtype_67L") or Variable.str.contains("inittransactioncode_186L") or Variable.str.contains("lastst_736L") or Variable.str.contains("paytype1st_925L") or Variable.str.contains("paytype_783L") or Variable.str.contains("twobodfilling_608L") or Variable.str.contains("typesuite_864L")
| Variable | Description | File | |
|---|---|---|---|
| 0 | bankacctype_710L | Type of applicant's bank account. | [train_static_0_0, train_static_0_1] |
| 1 | credtype_322L | Type of credit. | [train_static_0_0, train_static_0_1] |
| 2 | disbursementtype_67L | Type of disbursement. | [train_static_0_0, train_static_0_1] |
| 3 | inittransactioncode_186L | Transaction type of the initial credit transaction. |
[train_static_0_0, train_static_0_1] |
| 4 | lastst_736L | Status of the client's previous credit application. |
[train_static_0_0, train_static_0_1] |
| 5 | paytype1st_925L | Type of first payment of the client. | [train_static_0_0, train_static_0_1] |
| 6 | paytype_783L | Type of payment. | [train_static_0_0, train_static_0_1] |
| 7 | twobodfilling_608L | Type of application process. | [train_static_0_0, train_static_0_1] |
| 8 | typesuite_864L | Persons accompanying the client during the loan application process. |
[train_static_0_0, train_static_0_1] |
Time Sensitivity?
Maybe not to include time-features in as training dataset.
import matplotlib.pyplot as plt
import seaborn as sns
import tools
d=(
ds.get('train_base')
.mutate(date_decision=_.date_decision.to_timestamp('%Y-%m-%d').date())
.mutate(dm=_.WEEK_NUM)
.group_by([_.dm, _.target])
.aggregate(n=_.count())
.to_pandas()
)
# d
import altair as alt
c0=alt.Chart(
ds.get('train_base')
.mutate(date_decision=_
.date_decision
.to_timestamp('%Y-%m-%d').date()
.truncate('M')
)
.group_by([_.date_decision])
.aggregate(n=_.count())
.to_pandas()
,
title=['Loan Approval Volumn',
'There is a sharp drope during covid'
],height=150, width=670
).mark_line(color='midnightblue').encode(
x=alt.X('date_decision',title='Decision Date'),
y=alt.Y('n',title='Loan Approved')
)
c1=alt.Chart(d,title='Volumn of Approval and Deafult').mark_area().encode(
x=alt.X('dm',title='Week Number'),
y=alt.Y('n',title='Number of Case'),
color='target:N'
)
c2=alt.Chart(d,title='Propotion of Default Overtime').mark_area().encode(
x=alt.X('dm',title='Week Number'),
y=alt.Y('n',title='Propotion of Case').stack('normalize'),
color='target:N'
)
c0 & (c1 | c2)reload(tools)
from tools import FeatureDefinition
pretty_display(
FeatureDefinition().lookup_dsc('change')
.query('File=="train_static_0_0"')
)| Variable | File | Description | |
|---|---|---|---|
| 415 | equalitydataagreement_891L | train_static_0_0 | Flag indicating sudden changes in client's social- demographic data (e.g. education, family status, housing type). |
| 417 | equalityempfrom_62L | train_static_0_0 | Flag indicating a sudden change in the client's length of employment. |
Explore Categorical Variables
fd=FeatureDefinition()
intersting_cols = [
'equalitydataagreement_891L',
'equalityempfrom_62L',
'bankacctype_710L',
'credtype_322L','disbursementtype_67L',
'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
'twobodfilling_608L','typesuite_864L',
'lastrejectreasonclient_4145040M'
]
# pretty_display(
# fd.lookup_col(intersting_cols)
# )
d2=(
ibis.union(
ds.get('train_static_0_0'),
ds.get('train_static_0_1')
)
.select(s.contains(intersting_cols + ['case_id']))
.join(ds.get('train_base'), ['case_id'])
)
C = []
for c in intersting_cols:
# display(d2.select(c,'target').head(1))
description = fd.lookup_col(c).Description.str.wrap(30).loc[0]
# print(description)
d=(d2
.group_by([c,'target'])
.aggregate(n=_.count())
.to_pandas())
ct1=alt.Chart(
d
).mark_bar().encode(
y=alt.Y(c + ':N', title=description.split('\n')),
x=alt.X('n:Q',title=''),
color='target:N'
)
ct2=alt.Chart(
d,width=100
).mark_bar().encode(
y=alt.Y(c + ':N', title='', axis=None),
x=alt.X('n:Q',title='').stack('normalize'),
color='target:N',
)
ct=ct1 | ct2
C += [ct]
reduce(alt.vconcat, C).configure_axisY(
titleAngle=0,
titleAlign="left",
titleY=10,
titleX=-200,
titleColor='#404040',
titleFontWeight='lighter'
).properties(
title=[
"Categorical Variable Appears in Static Dataset",
"Nothing distinct by propotion except 'Status of the client's previous application'"
]
)pretty_display(fd
.lookup_tbl('static_0_0')
.query('Variable.str.endswith("M")')
.reset_index(drop=True))| Variable | File | Description | |
|---|---|---|---|
| 0 | lastapprcommoditycat_1041M | train_static_0_0 | Commodity category of the last loan applications made by the applicant. |
| 1 | lastapprcommoditytypec_5251766M | train_static_0_0 | Commodity type of the last application. |
| 2 | lastcancelreason_561M | train_static_0_0 | Cancellation reason of the last application. |
| 3 | lastrejectcommoditycat_161M | train_static_0_0 | Category of commodity in the applicant's last rejected application. |
| 4 | lastrejectcommodtypec_5251769M | train_static_0_0 | Commodity type of the last rejected application. |
| 5 | lastrejectreason_759M | train_static_0_0 | Reason for rejection on the most recent rejected application. |
| 6 | lastrejectreasonclient_4145040M | train_static_0_0 | Reason for the client's last loan rejection. |
| 7 | previouscontdistrict_112M | train_static_0_0 | Contact district of the client's previous approved application. |
d=(
ds.get('train_static_0_0')
.join(ds.get('train_base'),'case_id')
.select(s.endswith('M'), _.target, _.case_id)
.drop('WEEK_NUM')
# .pivot_longer(
# s.endswith('M')
# )
# .group_by([_.name,_.value])
# .aggregate(n=_.count())
)(d
.select(c, 'target')
.group_by([s.contains(c),s.contains('target')])
.aggregate(n=_.count())
.to_pandas())| previouscontdistrict_112M | target | n | |
|---|---|---|---|
| 0 | P6_35_77 | 0 | 2506 |
| 1 | P41_138_103 | 0 | 1766 |
| 2 | P54_133_26 | 0 | 8831 |
| 3 | P197_47_166 | 0 | 34040 |
| 4 | P111_135_181 | 0 | 11957 |
| ... | ... | ... | ... |
| 393 | P159_160_144 | 1 | 27 |
| 394 | P217_60_135 | 1 | 16 |
| 395 | P78_30_175 | 1 | 26 |
| 396 | P7_110_89 | 1 | 18 |
| 397 | P31_42_128 | 1 | 7 |
398 rows × 3 columns
fd=FeatureDefinition()
mcols = [i for i in d.columns if i not in ['target', "case_id"]]
C=[]
for c in mcols:
data=(d
.select(c, 'target')
.group_by([s.contains(c),s.contains('target')])
.aggregate(n=_.count())
.to_pandas())
description=fd.lookup_col(c).Description.str.wrap(30).loc[0]
cl1=(
alt
.Chart(data)
.mark_bar()
.encode(
x='n:Q',
y=alt.Y(c + ':N',title=description.split('\n')),
color='target:N'
)
)
cl2=(
alt
.Chart(data,width=100)
.mark_bar(
)
.encode(
x=alt.X('n:Q',title='').stack('normalize'),
y=alt.Y(c + ':N',title='',axis=None),
color='target:N'
)
)
C+=[cl1 | cl2]
CH=reduce(alt.vconcat,C)
CH.configure_axisY(
titleAngle=0,
titleAlign="left",
titleY=10,
titleX=-250,
titleColor='#404040',
titleFontWeight='lighter'
).properties(
title='All Category Variable'
)# alt.data_transformers.enable("vegafusion")
# alt.Chart(d.to_pandas()).transform_fold(
# [i for i in d.columns if i not in ['target', "case_id"]]
# ).mark_bar().encode(
# x='count():Q',
# y='value:N',
# column='key:N'
# )
d┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ ┃ lastapprcommoditycat_1041M ┃ lastapprcommoditytypec_5251766M ┃ lastcancelreason_561M ┃ lastrejectcommoditycat_161M ┃ lastrejectcommodtypec_5251769M ┃ lastrejectreason_759M ┃ lastrejectreasonclient_4145040M ┃ previouscontdistrict_112M ┃ target ┃ case_id ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │ string │ string │ string │ string │ string │ string │ string │ string │ int64 │ int64 │ ├────────────────────────────┼─────────────────────────────────┼───────────────────────┼─────────────────────────────┼────────────────────────────────┼───────────────────────┼─────────────────────────────────┼───────────────────────────┼────────┼─────────┤ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 0 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 2 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 3 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 6 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 7 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 8 │ │ a55475b1 │ a55475b1 │ P73_130_169 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 10 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 11 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 12 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 13 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └────────────────────────────┴─────────────────────────────────┴───────────────────────┴─────────────────────────────┴────────────────────────────────┴───────────────────────┴─────────────────────────────────┴───────────────────────────┴────────┴─────────┘